#########################################################################
######### Load Data #############################################
#########################################################################

dat<-read.csv("county-dataNov5.csv")

### Historical variables ###
dat$pctJews<-dat$Jewish/dat$Pop1931
dat$pctCath<-dat$Catholic/dat$Pop1931
dat$pctGrCath<-dat$Gr_Catholi/dat$Pop1931
dat$pctOrth<-dat$Orthodox/dat$Pop1931
dat$pctAug<-(dat$Evan_Augs+dat$Evan_Ref+dat$Evan_Uni+dat$Evan_bez)/dat$Pop1931
dat$pctUrb<-dat$PopUrb1931/dat$Pop1931
dat$pctRoln<-dat$rolnictwoT/dat$Pop1931
dat$pctLit<-dat$Literate/dat$Pop1931

## Contemporary dependent variables ###
dat$pctYes2003<-dat$yes2003/dat$ValidVotes
dat$pctVoteLPR<-dat$LPRvote200/dat$voted2001

### Contemporary socio-economic variables
dat$pctUrban2002<-dat$wmiastach2/dat$totPop2002
dat$shareFarm2002<-dat$Rolniki200/dat$totPop2002
dat$PctPrivEmpl2002<-dat$PrivatEmpl/dat$TotEmploye
dat$incTaxpc2002<-dat$incTax2002/dat$totPop2002
dat$persincTaxpc2002<-dat$persIncTax/dat$totPop2002
dat$recentLogInc<-log(dat$persincTaxpc2002)-mean(log(dat$persincTaxpc2002))

dat$Partition[dat$Austr==1]<-"Austr"
dat$Partition[dat$Rus==1]<-"Rus"
dat$Partition[dat$Prus==1]<-"Prus"
dat$Partition<-factor(dat$Partition)

dat$logpop = log(dat$Pop1931)


#########################################################################
######### Sequential-G Extensions #######################################
#########################################################################

### G-estimation with anti-Semitic vote
##Intermediate covariates: post-treatment to share Jewish (1931) 

dat$sharePriv2000<-dat$PrivSecEmpl2000/dat$TotEmpl2000
dat$shareOld2000<-dat$PostWorkAgePop2000/dat$TotPop2000
dat$shareUrb2000<-dat$UrbanPop2000/dat$TotDomicile2000

## first - Have M as BOTH pctvoteLPR and longInc, Have Z as present vars
### First stage 
g_stage1 <-lm(pctYes2003 ~ pctJews+pctVoteLPR + recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition+ shareUrb2000+shareOld2000+sharePriv2000, data = dat)
summary(g_stage1) 

### Second stage - demean by subtracting both M variables
g_stage2 <-lm(I(pctYes2003 - coef(g_stage1)["pctVoteLPR"]* pctVoteLPR - coef(g_stage1)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat)
summary(g_stage2) 

## bootstrap the SEs
set.seed(02138)
boots <- 1000
fl.boots <- matrix(nrow = boots, ncol = length(coef(g_stage2)))
colnames(fl.boots) = variable.names(g_stage2)

for (i in 1: boots){
  dat.star<-dat[sample(1:nrow(dat), replace=TRUE),]
  boot.first <- lm(pctYes2003 ~ pctJews+pctVoteLPR + recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition+ shareUrb2000+shareOld2000+sharePriv2000, data = dat.star)
  boot.direct <-lm(I(pctYes2003 - coef(boot.first)["pctVoteLPR"]* pctVoteLPR - coef(boot.first)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat.star)
  fl.boots[i,]<-coef(boot.direct)
}

SE1 = apply(fl.boots, MARGIN = 2,sd) #this varies a bit with each boot-strap


## second - get rid of Z, but keep both Ms (recent log inc)
### First stage 
g2_stage1 <-lm(pctYes2003 ~ pctJews+pctVoteLPR + recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data = dat)
summary(g2_stage1) 

### Second stage - demean by subtracting both M variables
g2_stage2 <-lm(I(pctYes2003 - coef(g2_stage1)["pctVoteLPR"]* pctVoteLPR - coef(g2_stage1)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat)
summary(g2_stage2) 

## bootstrap the SEs
fl.boots_2 <- matrix(nrow = boots, ncol = length(coef(g2_stage2)))
colnames(fl.boots_2) = variable.names(g2_stage2)

for (i in 1: boots){
  dat.star<-dat[sample(1:nrow(dat), replace=TRUE),]
  boot.first <- lm(pctYes2003 ~ pctJews+pctVoteLPR + recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data = dat.star)
  boot.direct <-lm(I(pctYes2003 - coef(boot.first)["pctVoteLPR"]* pctVoteLPR - coef(boot.first)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat.star)
  fl.boots_2[i,]<-coef(boot.direct)
}

SE2 = apply(fl.boots_2, MARGIN = 2,sd) #this varies a bit with each boot-strap

## Third - have only LPR as M, no Z
### First stage 
g3_stage1 <-lm(pctYes2003 ~ pctJews+pctVoteLPR  +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data = dat)
summary(g3_stage1) 

### Second stage - demean by subtracting both M variables
g3_stage2 <-lm(I(pctYes2003 - coef(g3_stage1)["pctVoteLPR"]* pctVoteLPR)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat)
summary(g3_stage2) 

## bootstrap the SEs
fl.boots_3 <- matrix(nrow = boots, ncol = length(coef(g3_stage2)))
colnames(fl.boots_3) = variable.names(g3_stage2)

for (i in 1: boots){
  dat.star<-dat[sample(1:nrow(dat), replace=TRUE),]
  boot.first <- lm(pctYes2003 ~ pctJews+pctVoteLPR + pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data = dat.star)
  boot.direct <-lm(I(pctYes2003 - coef(boot.first)["pctVoteLPR"]* pctVoteLPR )~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat.star)
  fl.boots_3[i,]<-coef(boot.direct)
}

SE3 = apply(fl.boots_3, MARGIN = 2,sd) #this varies a bit with each boot-strap


## Fourth - Have only Inc as M, no Z
### First stage 
g4_stage1 <-lm(pctYes2003 ~ pctJews + recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data = dat)
summary(g4_stage1) 

### Second stage - demean by subtracting both M variables
g4_stage2 <-lm(I(pctYes2003 - coef(g2_stage1)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat)
summary(g4_stage2) 

## bootstrap the SEs
fl.boots_4 <- matrix(nrow = boots, ncol = length(coef(g4_stage2)))
colnames(fl.boots_4) = variable.names(g4_stage2)

for (i in 1: boots){
  dat.star<-dat[sample(1:nrow(dat), replace=TRUE),]
  boot.first <- lm(pctYes2003 ~ pctJews+ recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data = dat.star)
  boot.direct <-lm(I(pctYes2003  - coef(boot.first)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat.star)
  fl.boots_4[i,]<-coef(boot.direct)
}

SE4 = apply(fl.boots_4, MARGIN = 2,sd) #this varies a bit with each boot-strap

## Fifth - have only LPR as M, include Z
### First stage 
g5_stage1 <-lm(pctYes2003 ~ pctJews+pctVoteLPR  +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition+ shareUrb2000+shareOld2000+sharePriv2000, data = dat)
summary(g5_stage1) 

### Second stage - demean by subtracting both M variables
g5_stage2 <-lm(I(pctYes2003 - coef(g5_stage1)["pctVoteLPR"]* pctVoteLPR)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition , data= dat)
summary(g5_stage2) 

## bootstrap the SEs
fl.boots_5 <- matrix(nrow = boots, ncol = length(coef(g5_stage2)))
colnames(fl.boots_5) = variable.names(g5_stage2)

for (i in 1: boots){
  dat.star<-dat[sample(1:nrow(dat), replace=TRUE),]
  boot.first <- lm(pctYes2003 ~ pctJews+pctVoteLPR + pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition + shareUrb2000+shareOld2000+sharePriv2000, data = dat.star)
  boot.direct <-lm(I(pctYes2003 - coef(boot.first)["pctVoteLPR"]* pctVoteLPR )~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat.star)
  fl.boots_5[i,]<-coef(boot.direct)
}

SE5 = apply(fl.boots_5, MARGIN = 2,sd) #this varies a bit with each boot-strap

## Six - Have only Inc as M, include Z
### First stage 
g6_stage1 <-lm(pctYes2003 ~ pctJews + recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition + shareUrb2000+shareOld2000+sharePriv2000, data = dat)
summary(g6_stage1) 

### Second stage - demean by subtracting both M variables
g6_stage2 <-lm(I(pctYes2003 - coef(g6_stage1)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat)
summary(g6_stage2) 

## bootstrap the SEs
fl.boots_6 <- matrix(nrow = boots, ncol = length(coef(g6_stage2)))
colnames(fl.boots_6) = variable.names(g6_stage2)

for (i in 1: boots){
  dat.star<-dat[sample(1:nrow(dat), replace=TRUE),]
  boot.first <- lm(pctYes2003 ~ pctJews+ recentLogInc +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition + shareUrb2000+shareOld2000+sharePriv2000, data = dat.star)
  boot.direct <-lm(I(pctYes2003  - coef(boot.first)["recentLogInc"]*recentLogInc)~ pctJews+ +pctCath+pctUrb+ pctLit+ pctRoln+log(Pop1931)+ Partition, data= dat.star)
  fl.boots_6[i,]<-coef(boot.direct)
}

SE6 = apply(fl.boots_6, MARGIN = 2,sd) #this varies a bit with each boot-strap

#present the "new" ones we did
stargazer(g_stage1, g_stage2, g2_stage1, g2_stage2, no.space = TRUE, se = list(NULL, SE1, NULL, SE2), dep.var.labels = c("EU Vote", "De-Meaned EU Vote", "EU Vote", "De-Meaned EU Vote"), omit.stat = "f")

#show that pctJewish in 1931 doesn't predict logInc today
incmodel = lm(recentLogInc ~ pctJews + pctCath + pctUrb + pctLit + pctRoln + log(Pop1931) + Partition, data = dat)
summary(incmodel)

stargazer(incmodel)

###########################################################################
###################### Predicted Val Plot - g_stage1 ######################
#############################################################################
library(mvtnorm)
set.seed(02138)

#simulate betas 
betas_EU = coef(g_stage1)
sim.beta.eu = rmvnorm(5000, mean = betas_EU, sigma = vcov(g_stage1))

# Sequence where vary pctVoteLPR from min to max 
lpr_seq = seq(from = min(dat$pctVoteLPR), to = max(dat$pctVoteLPR), by = .001)

# Create data frame to store values
eu_ci = data.frame(lower = rep(NA, length(lpr_seq)), mid = rep(NA, length(lpr_seq)), upper = rep(NA, length(lpr_seq)))
dat_work_eu = model.matrix(g_stage1)

#yi for all x_observed vectors we have and then average over them
for (i in 1:length(lpr_seq)){
  dat_work_eu[,3] = lpr_seq[i]
  yi_j = colMeans(apply(sim.beta.eu, MARGIN = 1, FUN = function(sim.beta.eu){((as.matrix(dat_work_eu)) %*% (sim.beta.eu))}))
  eu_ci[i,] = quantile(yi_j, probs = c(.025, .5, .975)) #store quantiles 
}

#bind jew sequence to quantiles 
eu_ci = cbind(lpr_seq, eu_ci)

#try again holding values at median
x_med_eu = apply(dat_work_eu, MARGIN = 2, median)

# create data frame to store values
eu_ci_med = data.frame(lower = rep(NA, length(lpr_seq)), mid = rep(NA, length(lpr_seq)), upper = rep(NA, length(lpr_seq)))

# have a vector of x_medians and replace x_pctVoteLPR with particular value in our pctVoteLPR sequence
# calculate Yi using our beta sims, then find quantiles, then store them 
for (i in 1:length(lpr_seq)){
  x_med_eu[3] = lpr_seq[i]
  yi = apply(sim.beta.eu, MARGIN = 1, FUN = function(sim.beta.eu){(t(as.matrix(x_med_eu)) %*% (sim.beta.eu))})
  eu_ci_med[i,] = quantile(yi, probs = c(.025, .5, .975))
}

# bind income to the quantiles
eu_ci_med = cbind(lpr_seq, eu_ci_med)

#plot
plot_pred_EU_both = ggplot(eu_ci) + 
  geom_line(aes(x=lpr_seq, y = mid, color = 'black')) + 
  geom_ribbon(aes(x=lpr_seq, ymin=lower, ymax=upper), fill = 'red', alpha = .25) + 
  xlab('pctVoteLPR') + 
  ylab('Predicted pct "YES" EU Vote') + 
  geom_line(data = eu_ci_med, aes(x=lpr_seq, y = mid, group = "X at Median", color = 'blue')) + 
  geom_ribbon(data = eu_ci_med, aes(x=lpr_seq, ymin=lower, ymax=upper), alpha = .25, fill = 'cyan') + 
  scale_color_discrete(labels = c("X at Observed", "X at Median"))
plot_pred_EU_both


###########################################################################
###################### Table 5: Pogrom data -- With G Est ##################
#############################################################################
miasta<-read.csv("pogroms.csv")
miasta$shareJewish<-miasta$Jewish/miasta$Pop.1921
miasta$shareCatholic<-miasta$Catholic/miasta$Pop.1921
miasta$shareLPR<-miasta$LPRnum2001/miasta$ValidVotes2001
miasta$proEU<-miasta$Yes2003/miasta$ValidVotes2003

##Her Model share Jewish is associated with higher LPR support
reg4<-lm(shareLPR ~ shareJewish+ shareCatholic +log(Pop.1921), data=miasta)
summary(reg4)

# Model with M as Ag, Y as LPR Vote
## our stage 1
dat$shareag2000 = 1 - dat$shareUrb2000 #create 2000 share ag variable
cor(dat$shareFarm2002, dat$shareag2000) #show that highly correlated to share farm
g7_stage1 = lm(pctVoteLPR ~ pctJews + shareag2000  + pctCath + pctUrb + pctLit + pctRoln + logpop + Partition  + shareOld2000 + sharePriv2000, data = dat)
summary(g7_stage1)

#Stage 2
g7_stage2 = lm(I(pctVoteLPR - coef(g7_stage1)["shareag2000"]*shareag2000 ) ~ pctJews + pctCath + pctUrb + pctLit + pctRoln + logpop + Partition, data = dat)
summary(g7_stage2)

## bootstrap the SEs
fl.boots_7 <- matrix(nrow = boots, ncol = length(coef(g7_stage2)))
colnames(fl.boots_7) = variable.names(g7_stage2)

for (i in 1: boots){
  dat.star<-dat[sample(1:nrow(dat), replace=TRUE),]
  boot.first <- lm(pctVoteLPR ~ pctJews + shareag2000  + pctCath + pctUrb + pctLit + pctRoln + logpop + Partition  + shareOld2000 + sharePriv2000, data = dat.star)
  boot.direct <-lm(I(pctVoteLPR - coef(g7_stage1)["shareag2000"]*shareag2000) ~ pctJews + pctCath + pctUrb + pctLit + pctRoln + logpop + Partition,  data= dat.star)
  fl.boots_7[i,]<-coef(boot.direct)
}

SE7 = apply(fl.boots_7, MARGIN = 2,sd) #this varies a bit with each boot-strap

#present findings 
stargazer(g7_stage1, g7_stage2, se = list(NULL, SE7), no.space = TRUE)

###########################################################################
###################### G Est Predicted pctLPR Visual  #######################
#############################################################################
library(mvtnorm)
set.seed(02138)

#simulate betas 
betas_Gest = coef(g7_stage2)
sim.beta = rmvnorm(5000, mean = betas_Gest, sigma = vcov(g7_stage2))

# Sequence where vary pctJews from min to max 
jew_seq = seq(from = min(dat$pctJews), to = max(dat$pctJews), by = .0025)

# Create data frame to store values
lpr_ci = data.frame(lower = rep(NA, length(jew_seq)), mid = rep(NA, length(jew_seq)), upper = rep(NA, length(jew_seq)))
dat_work = model.matrix(g7_stage2)

#yi for all x_observed vectors we have and then average over them
for (i in 1:length(jew_seq)){
  dat_work[,2] = jew_seq[i]
  yi_j = colMeans(apply(sim.beta, MARGIN = 1, FUN = function(sim.beta){((as.matrix(dat_work)) %*% (sim.beta))}))
  lpr_ci[i,] = quantile(yi_j, probs = c(.025, .5, .975)) #store quantiles 
}

#bind jew sequence to quantiles 
lpr_ci = cbind(jew_seq, lpr_ci)

#plot
plot_predict = ggplot(lpr_ci) + 
  geom_line(aes(x=jew_seq, y = mid, color = 'black')) + 
  geom_ribbon(aes(x=jew_seq, ymin=lower, ymax=upper), fill = 'red', alpha = .25) + 
  xlab('pctJews 1931') + 
  ylab('Predicted pctLPR Vote')
plot_predict_LPR

#try again holding values at median
x_med = apply(dat_work, MARGIN = 2, median)

# create data frame to store values
lpr_ci_med = data.frame(lower = rep(NA, length(jew_seq)), mid = rep(NA, length(jew_seq)), upper = rep(NA, length(jew_seq)))

# have a vector of x_medians and replace x_pctJews with particular value in our pctJews sequence
# calculate Yi using our beta sims, then find quantiles, then store them 
for (i in 1:length(jew_seq)){
  x_med[2] = jew_seq[i]
  yi = apply(sim.beta, MARGIN = 1, FUN = function(sim.beta){(t(as.matrix(x_med)) %*% (sim.beta))})
  lpr_ci_med[i,] = quantile(yi, probs = c(.025, .5, .975))
}

# bind income to the quantiles
lpr_ci_med = cbind(jew_seq, lpr_ci_med)

#plot
plot_pred_LPR_both = ggplot(lpr_ci) + 
  geom_line(aes(x=jew_seq, y = mid, color = 'black')) + 
  geom_ribbon(aes(x=jew_seq, ymin=lower, ymax=upper), fill = 'red', alpha = .25) + 
  xlab('pctJews 1931') + 
  ylab('Predicted pctLPR Vote') + 
  geom_line(data = lpr_ci_med, aes(x=jew_seq, y = mid, group = "X at Observed", color = 'blue')) + 
  geom_ribbon(data = lpr_ci_med, aes(x=jew_seq, ymin=lower, ymax=upper), alpha = .25, fill = 'cyan') + 
  scale_color_discrete(labels = c("X at Observed", "X at Median"))
plot_pred_LPR_both


